{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "038dfa04",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "请输入当前文件夹路径:D:\\Davion\n",
      "请输入文件名字:test.txt\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\Davion\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.580 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done!\n"
     ]
    }
   ],
   "source": [
    "# wordfreq-没有同义词\n",
    "\n",
    "import os\n",
    "import jieba\n",
    "import jieba.posseg as psg\n",
    "import re\n",
    "import pandas as pd\n",
    "def get_stop_dict(file):\n",
    "    content = open(file,encoding=\"utf-8\")\n",
    "    word_list = []\n",
    "    for c in content:\n",
    "        c = re.sub('\\n|\\r','',c)\n",
    "        word_list.append(c)\n",
    "    return word_list\n",
    "\n",
    "file_path = input(\"请输入当前文件夹路径:\")\n",
    "os.chdir(file_path)\n",
    "\n",
    "stop_file = \"stopwordlist.txt\"\n",
    "user_file = \"add_word_list.txt\"\n",
    "\n",
    "stop_words = get_stop_dict(stop_file)\n",
    "file_name = input(\"请输入文件名字:\")\n",
    "text = open(file_name,encoding=\"utf-8\").read()\n",
    "jieba.load_userdict(user_file)\n",
    "text_lines  = text.split('\\n')\n",
    "\n",
    "flag_list = ['n','nz','vn']#a,形容词，v,形容词\n",
    "counts={}\n",
    "\n",
    "for line in text_lines:\n",
    "    line_seg = psg.cut(line)\n",
    "    for word_flag in line_seg:\n",
    "        word = re.sub(\"[^\\u4e00-\\u9fa5]\",\"\",word_flag.word)\n",
    "        if word_flag.flag in flag_list and len(word)>1 and word not in stop_words:\n",
    "            counts[word]=counts.get(word,0)+1\n",
    "\n",
    "word_freq = pd.DataFrame({'word':list(counts.keys()),'freq':list(counts.values())})\n",
    "word_freq = word_freq.sort_values(by='freq',ascending=False)\n",
    "word_freq.to_excel(\"word_freq.xlsx\",index=False)\n",
    "\n",
    "print(\"done!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7f9a8551",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "请输入当前文件夹路径:D:\\Davion\n",
      "请输入文件名字:test.txt\n",
      "done!\n"
     ]
    }
   ],
   "source": [
    "# wordfreq-合并同义词\n",
    "\n",
    "import os\n",
    "import jieba\n",
    "import jieba.posseg as psg\n",
    "import re\n",
    "import pandas as pd\n",
    "def get_stop_dict(file):\n",
    "    content = open(file,encoding=\"utf-8\")\n",
    "    word_list = []\n",
    "    for c in content:\n",
    "        c = re.sub('\\n|\\r','',c)\n",
    "        word_list.append(c)\n",
    "    return word_list\n",
    "\n",
    "file_path = input(\"请输入当前文件夹路径:\")\n",
    "os.chdir(file_path)\n",
    "\n",
    "stop_file = \"stopwordlist.txt\"\n",
    "user_file = \"add_word_list.txt\"\n",
    "#add_word_list.txt内容格式是\"单词\"+\"空格\"+\"n\"，或者其他vn、a、nr等词性\n",
    "synonym_file = \"synonym_list.xlsx\"\n",
    "#第一列为'origin'，第二列为'new'，分别对应原始词语和替换后的词语\n",
    "\n",
    "stop_words = get_stop_dict(stop_file)\n",
    "synonym_words = pd.read_excel(\"synonym_list.xlsx\")\n",
    "synonym_origin = list(synonym_words['origin'])\n",
    "synonym_new = list(synonym_words['new'])\n",
    "\n",
    "file_name = input(\"请输入文件名字:\")\n",
    "text = open(file_name,encoding=\"utf-8\").read()\n",
    "jieba.load_userdict(user_file)\n",
    "text_lines  = text.split('\\n')\n",
    "\n",
    "\n",
    "flag_list = ['n','nz','vn']#a,形容词，v,形容词\n",
    "counts={}\n",
    "\n",
    "for line in text_lines:\n",
    "    line_seg = psg.cut(line)\n",
    "    for word_flag in line_seg:\n",
    "        word = re.sub(\"[^\\u4e00-\\u9fa5]\",\"\",word_flag.word)\n",
    "        if word_flag.flag in flag_list and len(word)>1 and word not in stop_words:\n",
    "            if word in synonym_origin:\n",
    "                index = synonym_origin.index(word)\n",
    "                word = synonym_new[index]\n",
    "            counts[word]=counts.get(word,0)+1\n",
    "\n",
    "word_freq = pd.DataFrame({'word':list(counts.keys()),'freq':list(counts.values())})\n",
    "word_freq = word_freq.sort_values(by='freq',ascending=False)\n",
    "word_freq.to_excel(\"word_freq1.xlsx\",index=False)\n",
    "\n",
    "print(\"done!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8ebfb4c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "请输入当前文件夹路径:D:\\Davion\n",
      "请输入词频excel文件名：word_freq.xlsx\n",
      "请输入同义词txt文件名：synonym_list.txt\n",
      "done!\n"
     ]
    }
   ],
   "source": [
    "# wordfreq-合并同义词2\n",
    "\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "file_path = input(\"请输入当前文件夹路径:\")\n",
    "os.chdir(file_path)\n",
    "\n",
    "file_name = input(\"请输入词频excel文件名：\")#列名为word,freq\n",
    "df = pd.read_excel(file_name)\n",
    "syn_name = input(\"请输入同义词txt文件名：\")\n",
    "#每行为互为同义词的几个词语，空格隔开(公司 企业 集团)，行首的词语为最终替换词语(最终全部合并为“公司”)\n",
    "txt = open(syn_name,encoding=\"utf-8\").read()\n",
    "txts = txt.split(\"\\n\")\n",
    "\n",
    "for line in txts:\n",
    "    words = line.split(\" \")\n",
    "    dic = {}\n",
    "    for word in words:\n",
    "        dic[word]=words[0]\n",
    "    df['word']=df['word'].replace(dic)\n",
    "\n",
    "df['new_freq']=df.groupby(['word'], as_index=False).cumsum()\n",
    "df = df.drop_duplicates(subset=['word'], keep='last')\n",
    "df=df[['word','new_freq']]\n",
    "\n",
    "df.to_excel(\"word_freq2.xlsx\",index=False)#保存新的词频文件\n",
    "print(\"done!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23a31667",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
